# Consolidated imports (duplicates removed: pandas, numpy and plotly.graph_objects
# were each imported several times) grouped stdlib / third-party.
import warnings
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pingouin as pg
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy.stats import ttest_rel, zscore
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.weightstats import ttest_ind

# Silence noisy library warnings and widen pandas display output for EDA.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 10)
The Average Scenarios dataset averages all the numerical columns of the scenarios into one row, outputting one row for each Location, Year, and RCP combination. This dataset is used when conducting EDA and visualizing trends over time.
# Normalize the RCP column: numeric strings become floats, while non-numeric
# entries (e.g. 'historical') are left untouched.
numeric_rcp = pd.to_numeric(df['RCP'], errors='coerce')
df['RCP'] = numeric_rcp.fillna(df['RCP'])

# Split the future predictions by RCP pathway.
four = df[df['RCP'].isin([4.5])]
eight = df[df['RCP'].isin([8.5])]

# Duplicate the historical rows into BOTH RCP groups so each group carries a
# complete timeline. `.copy()` avoids SettingWithCopyWarning / writing into a
# view of `df` (the original assigned into slices directly).
four_h = df[df['RCP'].isin(['historical'])].copy()
four_h['RCP'] = 4.5
eight_h = df[df['RCP'].isin(['historical'])].copy()
eight_h['RCP'] = 8.5

# Stitch historical + future rows for both pathways into one frame.
df_orig = pd.concat([four_h, four, eight_h, eight], ignore_index=True)
# Number each unique (long, lat) coordinate pair starting at 1.
df_orig['Location_ID'] = df_orig.groupby(['long', 'lat']).ngroup() + 1
df_orig.head(5)
Park
long
lat
veg
year
TimePeriod
RCP
scenario
treecanopy
Ann_Herb
Bare
Herb
Litter
Shrub
El
Sa
Cl
RF
Slope
E
S
T_P_Corr
DrySoilDays_Winter_top50
DrySoilDays_Spring_top50
DrySoilDays_Summer_top50
DrySoilDays_Fall_top50
DrySoilDays_Winter_whole
DrySoilDays_Spring_whole
DrySoilDays_Summer_whole
DrySoilDays_Fall_whole
Evap_Winter
Evap_Spring
Evap_Summer
Evap_Fall
ExtremeShortTermDryStress_Winter_top50
ExtremeShortTermDryStress_Spring_top50
ExtremeShortTermDryStress_Summer_top50
ExtremeShortTermDryStress_Fall_top50
ExtremeShortTermDryStress_Winter_whole
ExtremeShortTermDryStress_Spring_whole
ExtremeShortTermDryStress_Summer_whole
ExtremeShortTermDryStress_Fall_whole
FrostDays_Winter
FrostDays_Spring
FrostDays_Summer
FrostDays_Fall
NonDrySWA_Winter_top50
NonDrySWA_Spring_top50
NonDrySWA_Summer_top50
NonDrySWA_Fall_top50
NonDrySWA_Winter_whole
NonDrySWA_Spring_whole
NonDrySWA_Summer_whole
NonDrySWA_Fall_whole
PET_Winter
PET_Spring
PET_Summer
PET_Fall
PPT_Winter
PPT_Spring
PPT_Summer
PPT_Fall
SemiDryDuration_Annual_top50
SemiDryDuration_Annual_whole
SWA_Winter_top50
SWA_Spring_top50
SWA_Summer_top50
SWA_Fall_top50
SWA_Winter_whole
SWA_Spring_whole
SWA_Summer_whole
SWA_Fall_whole
T_Winter
T_Spring
T_Summer
T_Fall
Tmax_Winter
Tmax_Spring
Tmax_Summer
Tmax_Fall
Tmin_Winter
Tmin_Spring
Tmin_Summer
Tmin_Fall
Transp_Winter
Transp_Spring
Transp_Summer
Transp_Fall
VWC_Winter_top50
VWC_Spring_top50
VWC_Summer_top50
VWC_Fall_top50
VWC_Winter_whole
VWC_Spring_whole
VWC_Summer_whole
VWC_Fall_whole
WetSoilDays_Winter_top50
WetSoilDays_Spring_top50
WetSoilDays_Summer_top50
WetSoilDays_Fall_top50
WetSoilDays_Winter_whole
WetSoilDays_Spring_whole
WetSoilDays_Summer_whole
WetSoilDays_Fall_whole
PPT_Annual
T_Annual
RL
Location_ID
0
NABR
-110.0472
37.60413
Shrubland
1980
Hist
4.5
sc1
0
0
84
5
11
7
1764.955
77.03307
6.082058
2.285707
1949.283
-8753.784
4834.13
-0.6636760860
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3.7140658366
6.3995308949
1.5598074021
3.3632779979
NaN
24.34
36.16
29.52
NaN
24.34
36.16
29.52
75.0
34.0
0.0
26.0
3.4668806371
2.6546632530
0.0321140671
0.4880867481
3.4668806371
2.6546632530
0.0321140671
0.4880867481
7.7811633032
31.1394527955
48.0177480655
21.9156825756
13.79
8.71
2.69
6.37
36.5000000000
36.5000000000
3.4668806371
2.6546632530
0.0321140671
0.4880867481
3.4668806371
2.6546632530
0.0321140671
0.4880867481
0.96483520
8.767935
23.15924
11.962090
14.15
28.75
37.05
31.15
-12.45
-7.35
5.55
-10.25
0.2370806
5.296833
1.067496
1.9667860
0.1134468701
0.0968307001
0.0418759016
0.0522975530
0.1134468701
0.0968307001
0.0418759016
0.0522975530
91.0
77.0
5.0
47.0
91.0
77.0
5.0
47.0
31.56
11.21352505
54.57202074
1
1
NABR
-110.0472
37.60413
Shrubland
1981
Hist
4.5
sc1
0
0
84
5
11
7
1764.955
77.03307
6.082058
2.285707
1949.283
-8753.784
4834.13
0.3478010620
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
2.1815202084
5.9723378265
5.0428776741
4.6374034668
13.92
26.53
36.08
NaN
13.92
26.53
36.08
NaN
79.0
26.0
0.0
13.0
0.3461917264
0.8982752558
0.0336629893
2.5013360811
0.3461917264
0.8982752558
0.0336629893
2.5013360811
8.1229049607
32.3882557036
48.1772426406
21.7575735702
2.25
9.81
9.39
11.75
13.2500000000
13.2500000000
0.3461917264
0.8982752558
0.0336629893
2.5013360811
0.3461917264
0.8982752558
0.0336629893
2.5013360811
3.33444400
10.548370
23.27065
11.581320
17.05
28.15
37.55
29.75
-9.35
-5.55
1.25
-7.25
0.2930753
3.506108
3.916328
2.7875470
0.0493818430
0.0607271763
0.0426386771
0.0936706801
0.0493818430
0.0607271763
0.0426386771
0.0936706801
48.0
60.0
13.0
85.0
48.0
60.0
13.0
85.0
33.20
12.18369600
54.57202074
1
2
NABR
-110.0472
37.60413
Shrubland
1982
Hist
4.5
sc1
0
0
84
5
11
7
1764.955
77.03307
6.082058
2.285707
1949.283
-8753.784
4834.13
0.3260300992
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3.2589947135
4.7173273934
4.5276363327
4.2477717540
NaN
26.19
34.99
22.06
NaN
26.19
34.99
22.06
83.0
21.0
0.0
30.0
3.2599844936
1.5994982052
0.1993822366
1.2432253150
3.2599844936
1.5994982052
0.1993822366
1.2432253150
7.3379526955
31.4894498184
47.1800768757
21.0684231651
4.12
5.10
9.50
9.83
17.2857142857
17.2857142857
3.2599844936
1.5994982052
0.1993822366
1.2432253150
3.2599844936
1.5994982052
0.1993822366
1.2432253150
-0.01555556
9.472283
22.05707
9.869231
14.35
28.45
36.65
31.85
-16.55
-7.25
5.65
-6.25
0.2453347
3.105047
3.523923
2.8900990
0.1092341982
0.0748166564
0.0456102615
0.0677891794
0.1092341982
0.0748166564
0.0456102615
0.0677891794
90.0
62.0
19.0
73.0
90.0
62.0
19.0
73.0
28.55
10.34575711
54.57202074
1
3
NABR
-110.0472
37.60413
Shrubland
1983
Hist
4.5
sc1
0
0
84
5
11
7
1764.955
77.03307
6.082058
2.285707
1949.283
-8753.784
4834.13
0.0388273872
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3.7419915365
6.2671578978
5.1695757094
3.7751048188
NaN
28.56
33.69
31.02
NaN
28.56
33.69
31.02
85.0
32.0
0.0
19.0
3.8064480379
2.9456592119
0.0960442305
1.5835966242
3.8064480379
2.9456592119
0.0960442305
1.5835966242
7.4798456947
30.3128312703
46.5762368398
21.8471460016
7.09
10.80
10.22
10.40
16.7142857143
16.7142857143
3.8064480379
2.9456592119
0.0960442305
1.5835966242
3.8064480379
2.9456592119
0.0960442305
1.5835966242
0.40944440
8.020652
21.32826
11.325820
13.35
30.65
34.55
33.15
-15.05
-7.25
3.85
-8.95
0.2252735
4.962824
5.006576
1.1952350
0.1204177901
0.1025422325
0.0441405046
0.0748017843
0.1204177901
0.1025422325
0.0441405046
0.0748017843
90.0
74.0
15.0
69.0
90.0
74.0
15.0
69.0
38.51
10.27104410
54.57202074
1
4
NABR
-110.0472
37.60413
Shrubland
1984
Hist
4.5
sc1
0
0
84
5
11
7
1764.955
77.03307
6.082058
2.285707
1949.283
-8753.784
4834.13
0.2166602692
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3.6272686835
5.0078604793
5.2303324404
4.0803373430
NaN
30.95
34.01
29.52
NaN
30.95
34.01
29.52
91.0
35.0
0.0
30.0
3.7945975224
1.7555326024
0.0452782946
1.3792575946
3.7945975224
1.7555326024
0.0452782946
1.3792575946
7.1730101555
31.9972417196
47.0386757592
21.0183982059
4.77
4.32
9.49
8.17
16.5000000000
16.5000000000
3.7945975224
1.7555326024
0.0452782946
1.3792575946
3.7945975224
1.7555326024
0.0452782946
1.3792575946
-1.04725300
9.853804
21.95978
10.034070
10.25
32.75
35.35
31.35
-18.45
-8.45
2.95
-12.45
0.1226868
3.120243
4.269040
0.9273169
0.1202091711
0.0778415354
0.0431793330
0.0703661709
0.1202091711
0.0778415354
0.0431793330
0.0703661709
91.0
65.0
16.0
62.0
91.0
65.0
16.0
62.0
26.75
10.20010025
54.57202074
1
Data Exploration
Basic Statistics
Basic Statistics - 79 years of prediction (2021~2099) - 40 scenarios (sc22~sc61) - 2 RCP scenarios(4.5, 8.5) - 113 locations
Explanation - The data is collected over 113 locations within the Natural Bridge National Monument. (Number of Unique latitude, longitude combinations) - This dataset is composed of 41 years of historical data and 79 years worth of predictions. Since there can be only one scenario for past data, all historical data is labeled as ‘sc1’ or scenario one - From the predicted years (2021 to 2099), There are two RCP scenarios which changes only the RCP variable and 40 scenarios which simulate 86 other variables.
Based on each combination of scenarios, a prediction is made for each location point regarding various columns of the dataset, including annual and seasonal precipitation, seasonal dry soil days, seasonal evaporation, seasonal extreme short-term dry stress, and soil water availability, to output a final prediction for annual and seasonal temperatures.
What is RCP?
Representative Concentration Pathways: a group of scenarios in which CO2 emissions are predicted, as in the image below - The dataset consists of two RCP scenarios, 4.5 and 8.5
source : Representative Concentration Pathway. (2024, May 2). In Wikipedia. https://en.wikipedia.org/wiki/Representative_Concentration_Pathway
Location
Where is this data located and how does it look like?
The data points were sampled at the Natural Bridge National Monument in Utah. To give a better idea of the area, the plots below show two different location aspects of the dataset. The first plot is the average annual temperature for each location point in the year 2099. Since the predicted temperature increases over time, the last year of the dataset was chosen for a more dramatic comparison.
The second plot is a scatter plot of the locations of vegetation types. By comparing the two graphs, we can tell that there isn't much correlation between vegetation and annual temperature, but rather a correlation between location (latitude, longitude) and temperature. We will get to this in the following visualizations.
Map Visualizations
# Mean 2099 annual temperature at each sampled coordinate.
# (Renamed from `map`, which shadowed the Python builtin.)
avg_temp_2099 = (
    df_con[df_con['year'] == 2099]
    .groupby(['long', 'lat'])['T_Annual']
    .mean()
    .reset_index()
)
fig = px.scatter_mapbox(
    avg_temp_2099,
    lat="lat", lon="long",
    color="T_Annual", size="T_Annual",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=8, zoom=11,
    mapbox_style="open-street-map",
)
fig.update_layout(
    title={'text': "<b>Average Temperature (2099) </b>", 'y': 0.97, 'x': 0.5,
           'xanchor': 'center', 'yanchor': 'top'},
    margin={"r": 0, "t": 40, "l": 0, "b": 0},
)
fig.show()

# Vegetation type observed at each coordinate in 2099.
veg_locations = (
    df_con[df_con['year'] == 2099]
    .groupby(['long', 'lat', 'veg'])
    .size()
    .reset_index()
)
fig = px.scatter_mapbox(
    veg_locations,
    lat="lat", lon="long",
    color="veg",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=8, zoom=11,
    mapbox_style="open-street-map",
)
fig.update_layout(
    title={'text': "<b>Vegetation Location</b>", 'y': 0.97, 'x': 0.5,
           'xanchor': 'center', 'yanchor': 'top'},
    coloraxis_colorbar={'title': 'Vegetation Level'},
    # Pin the categorical legend to the right of the plot.
    legend={'x': 1, 'y': 0.8, 'xanchor': 'left', 'yanchor': 'middle'},
    margin={"r": 0, "t": 40, "l": 0, "b": 0},
)
fig.update_traces(marker=dict(size=10))  # fixed marker size for all points
fig.show()
Temperature/Precipitation Trends
The following plots were drawn by averaging all scenarios, locations, and RCPs for a given year for annual temperature and annual precipitation, to see the overall trend of the predictions in the dataset. Predictions begin in the year 2021, which is marked by the dashed red line in each plot.
We can see that the predictions portray an increase in temperature but a fluctuation in precipitation, allowing us to make an educated guess that temperature is the more important variable for RCP scenarios, which deal with CO2 emissions.
Temperature / Precipitation Predictions Overview
def _plot_annual_mean(column, title, axis_label):
    """Plot the dataset-wide yearly mean of `column` with a dashed red line
    marking where the predictions begin (2021).

    column: name of the numeric column in df_con to average per year.
    title: HTML-formatted figure title.
    axis_label: y-axis label for the plotted column.
    """
    yearly = df_con.groupby(['year'])[column].mean().reset_index()
    fig = px.line(
        data_frame=yearly,
        x='year', y=column,
        title=title,
        # Fixed: the precipitation plot previously reused the temperature
        # labels dict ({'T_Annual': 'Annual Temperature'}).
        labels={column: axis_label},
        line_shape='spline',
    )
    # Vertical dashed line at the first predicted year.
    fig.add_shape(dict(
        type='line',
        x0=2021, y0=yearly[column].min() / 1.1,
        x1=2021, y1=yearly[column].max() * 1.1,
        line=dict(color="Red", width=2, dash="dash"),
    ))
    # Annotate the prediction boundary.
    fig.add_annotation(dict(
        x=2021, y=yearly[column].max(),
        xref="x", yref="y",
        text="Prediction", showarrow=False,
        font=dict(size=12, color="Red"),
        align="center", xanchor="left",
    ))
    fig.update_layout(title={'x': 0.5})
    fig.show()

_plot_annual_mean('T_Annual', '<b>Annual Temperature</b>', 'Annual Temperature')
_plot_annual_mean('PPT_Annual', '<b>Annual Precipitation</b>', 'Annual Precipitation')
Perspectives to Consider
What are some aspects of the datasets we can slice and dice or drill down to compare and retrieve meaningful insights?
Most numerical features are generated based on the scenario of the model that generated the future data, and some numerical features such as S, E, Slope, RF, Cl, Sa, El, treecanopy, etc. are features that are fixed according to a unique location. Therefore, categorical variables are the aspects of the dataset we can compare to create new insights.
The following plots compare the predicted annual temperature for each category for the three categorical variables
Temperature RCP comparison
# Yearly mean annual temperature, split out by RCP pathway.
rcp_yearly = (
    df_con.groupby(['year', 'RCP'])['T_Annual']
    .mean()
    .reset_index()
)

fig = px.line(
    data_frame=rcp_yearly,
    x='year',
    y='T_Annual',
    color='RCP',  # one colored line per RCP value
    title='<b>Annual Temperature by RCP Type</b>',
    labels={'T_Annual': 'Annual Temperature'},
    line_shape='spline',
)
fig.update_layout(title={'x': 0.5})

# Dashed red vertical line where the predictions begin (2021).
fig.add_shape(dict(
    type='line',
    x0=2021, y0=rcp_yearly['T_Annual'].min() / 1.1,
    x1=2021, y1=rcp_yearly['T_Annual'].max() * 1.1,
    line=dict(color="Red", width=2, dash="dash"),
))
# Label the prediction boundary.
fig.add_annotation(dict(
    x=2021,
    y=rcp_yearly['T_Annual'].max(),
    xref="x", yref="y",
    text="Prediction",
    showarrow=False,
    font=dict(size=12, color="Red"),
    align="center",
    xanchor="left",
))
fig.show()
Since RCP deals with CO2 emission and the 8.5 scenario has a higher emission prediction than the 4.5 scenario, the annual temperature increase of rcp8.5 is more rapid than rcp4.5 although both are increasing.
Temperature comparison (Vegetation)
# Yearly mean annual temperature per vegetation type, restricted to
# historical rows plus the RCP 4.5 pathway.
veg_yearly = (
    df_con[df_con['RCP'].isin(['historical', 4.5])]
    .groupby(['year', 'veg'])['T_Annual']
    .mean()
    .reset_index()
)

fig = px.line(
    data_frame=veg_yearly,
    x='year',
    y='T_Annual',
    color='veg',  # one line per vegetation type
    title='<b>Annual Temperature by Vegetation Type</b>',
    labels={'T_Annual': 'Annual Temperature'},
)
fig.update_layout(title={'x': 0.5})

# Dashed red vertical line where the predictions begin (2021).
fig.add_shape(dict(
    type='line',
    x0=2021, y0=veg_yearly['T_Annual'].min() / 1.1,
    x1=2021, y1=veg_yearly['T_Annual'].max() * 1.1,
    line=dict(color="Red", width=2, dash="dash"),
))
# Label the prediction boundary.
fig.add_annotation(dict(
    x=2021,
    y=veg_yearly['T_Annual'].max(),
    xref="x", yref="y",
    text="Prediction",
    showarrow=False,
    font=dict(size=12, color="Red"),
    align="center",
    xanchor="left",
))
fig.show()
The vegetation types seem to follow exactly the same pattern of prediction, with a fixed difference between each other. This may mean that when calculating predictions based on scenarios, the algorithm was modeled so that the means of the vegetation types were always a given distance apart from each other. Because of this limitation of the algorithm, it is unnecessary to compare vegetation types with each other. We will always get the same difference.
Temperature comparison (scenario)
# Boxplot of future (non-historical) annual temperature per scenario,
# with boxes ordered by each scenario's median temperature and colored by RCP.
future_rows = df_orig[df_orig['TimePeriod'] != 'Hist']

# Median T_Annual per scenario, sorted ascending.
scenario_order = (
    future_rows.groupby('scenario')['T_Annual']
    .median()
    .reset_index()
    .sort_values('T_Annual')
)

# Left-merge on the ordered scenario list so the boxes appear in median order.
ordered_rows = pd.merge(scenario_order['scenario'], future_rows, on='scenario', how='left')

fig = px.box(ordered_rows, x='scenario', y='T_Annual', color='RCP')
fig.update_layout(
    xaxis_tickangle=-90,  # vertical tick labels: many scenario names
    title={
        'text': "<b>Annual Temperature by Scenario</b>",
        'x': 0.5,
        'xanchor': 'center',
    },
)
fig.show()
Since we already know that RCP plays a big role in how the algorithm predicts the temperature, we will divide the scenarios into 4.5 scenarios and 8.5 scenarios to see if there is a significant difference. By examining the plot we now know that RCP 4.5 corresponds to scenarios 22~41 and RCP 8.5 corresponds to scenarios 42~61. There are cases where 4.5 scenarios had higher temperatures than 8.5 scenarios, but since RCP acts as the first drill-down layer of the dataset, we can use the scenario as the second drill-down of the dataset.
For our dataset analysis, we will be comparing the maximum and minimum scenario for each RCP group to analyze what features affect temperature the most. That is comparing scenario 37 to scenario 40 for RCP 4.5 scenarios, and comparing scenario 58 to scenario 60 to do the same for RCP 8.5.
Statistical Significance
Is there a significant difference between different scenarios?
Before we start analyzing our dataset, one final step we want to take is proving the statistical significance in the different scenarios we plan on comparing.
The three comparisons we plan on making are as follows: 1. RCP 8.5 (High) vs RCP 4.5 (Low) 2. RCP 4.5: Scenario 37 (High) vs Scenario 40 (Low) 3. RCP 8.5: Scenario 60 (High) vs Scenario 58 (Low)
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# PCA on the late-century (2095-2099) RCP 4.5 rows to see how scenarios
# separate; columns containing any NaN are dropped first.
data = df_orig[(df_orig['RCP'] == 4.5) & (df_orig['year'].isin(range(2095, 2100)))].dropna(axis=1, how='any')
# Numeric feature matrix: columns 1-2 (long/lat), 4 (year), 6 (RCP),
# plus the simulated climate columns (8 .. len-3).
X = data.iloc[:, list(range(1, 3)) + [4, 6] + list(range(8, len(data.columns) - 3))]

# Standardize so every feature contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep 10 principal components (fixed: the old comment claimed 2; only the
# first 3 are visualized in the following cells).
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

# k-means on the PCA scores (kept for reference; not used in the plots below).
kmeans = KMeans(n_clusters=2, random_state=0)
clusters = kmeans.fit_predict(X_pca)

# Attach the first three component scores for the scatter plots that follow.
data['PCA1'] = X_pca[:, 0]
data['PCA2'] = X_pca[:, 1]
data['PCA3'] = X_pca[:, 2]

# Component loadings: how strongly each original feature drives each component.
loadings = pca.components_.T
columns = X.columns
loadings_df = pd.DataFrame(
    loadings,
    columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5', 'PCA6', 'PCA7', 'PCA8', 'PCA9', 'PCA10'],
    index=columns,
)

# Explained variance per component and its cumulative sum.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Scree plot of the explained variance ratio.
x_labels = [f'PCA{i+1}' for i in range(len(explained_variance_ratio))]
fig = go.Figure(data=go.Scatter(
    x=x_labels,
    y=explained_variance_ratio,
    mode='lines+markers',
    text=explained_variance_ratio,
    textposition='top center',
))
fig.update_layout(
    title='<b>Explained Variance Ratio by Principal Components</b>',
    xaxis_title='Principal Components',
    yaxis_title='Explained Variance Ratio',
    yaxis=dict(tickformat=".2%", range=[0, 1.1 * explained_variance_ratio.max()]),
    template='plotly_white',
    margin=dict(l=50, r=50, b=50, t=50),  # adjust the padding as needed
)
fig.show()
Code
# 2-D scatter of the first two principal components, colored by scenario.
pca_scatter = px.scatter(
    data,
    x='PCA1',
    y='PCA2',
    color='scenario',
    title='<b>PCA and k-means Clustering</b>',
    labels={'PCA1': 'PCA1', 'PCA2': 'PCA2'},
    opacity=0.5,
)
pca_scatter.update_layout(
    margin=dict(l=10, r=10, b=10, t=40),
    title_x=0.5,
    title_y=0.95,
)
pca_scatter.show()
Code
# 3-D scatter of the first three principal components, colored by scenario.
fig3d = px.scatter_3d(
    data,
    x='PCA1', y='PCA2', z='PCA3',
    color='scenario',
    title='<b>3D PCA with Scenario</b>',
    labels={'PCA1': 'PCA1', 'PCA2': 'PCA2', 'PCA3': 'PCA3'},
    opacity=0.5,
    size_max=0.1,
)
fig3d.update_traces(marker=dict(size=3))  # shrink markers for readability

# Tight margins plus explicit axis titles for the 3-D scene.
fig3d.update_layout(
    margin=dict(l=5, r=5, b=5, t=20),
    title_x=0.5,
    title_y=0.95,
    scene=dict(
        xaxis=dict(title='PCA1'),
        yaxis=dict(title='PCA2'),
        zaxis=dict(title='PCA3'),
    ),
)
fig3d.show()
Code
# Rank features by the magnitude of their PCA1 loading.
sorted_loadings = loadings_df['PCA1'].abs().sort_values(ascending=False)
# Take the 30 most influential features (fixed: comments/title said 10/20).
top_features = sorted_loadings.head(30).index
top_loadings = round(loadings_df.loc[top_features, 'PCA1'], 4)
# Blue bars for positive loadings, red for negative.
colors = ['blue' if val > 0 else 'red' for val in top_loadings]

fig = go.Figure(data=[go.Bar(
    x=top_loadings.index,
    y=top_loadings.abs(),
    text=top_loadings.values,  # annotate each bar with the signed loading
    textposition='inside',
    marker_color=colors,
    showlegend=False,
)])
# Dummy traces so the color coding appears in the legend.
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='blue'), showlegend=True, name='Positive'))
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='red'), showlegend=True, name='Negative'))
fig.update_layout(
    title='<b>Top 30 Most Influential Features on PCA1</b>',  # fixed: 30 bars shown, not 20
    xaxis_title='Features',
    yaxis_title='Absolute PCA1 Loadings',
    yaxis=dict(tickformat=".2f"),
    template='plotly_white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=20, r=20, b=20, t=60),
)
fig.show()
Code
# Rank features by the magnitude of their PCA2 loading.
sorted_loadings = loadings_df['PCA2'].abs().sort_values(ascending=False)
# Take the 30 most influential features (fixed: comments/title said 10/20).
top_features = sorted_loadings.head(30).index
top_loadings = round(loadings_df.loc[top_features, 'PCA2'], 4)
# Blue bars for positive loadings, red for negative.
colors = ['blue' if val > 0 else 'red' for val in top_loadings]

fig = go.Figure(data=[go.Bar(
    x=top_loadings.index,
    y=top_loadings.abs(),
    text=top_loadings.values,  # annotate each bar with the signed loading
    textposition='inside',
    marker_color=colors,
    showlegend=False,
)])
# Dummy traces so the color coding appears in the legend.
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='blue'), showlegend=True, name='Positive'))
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='red'), showlegend=True, name='Negative'))
fig.update_layout(
    title='<b>Top 30 Most Influential Features on PCA2</b>',  # fixed: 30 bars shown, not 20
    xaxis_title='Features',
    yaxis_title='Absolute PCA2 Loadings',  # fixed: previously said PCA1
    yaxis=dict(tickformat=".2f"),
    template='plotly_white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=20, r=20, b=20, t=60),
)
fig.show()
Code
# Rank features by the magnitude of their PCA3 loading.
sorted_loadings = loadings_df['PCA3'].abs().sort_values(ascending=False)
# Take the 30 most influential features (fixed: comments/title said 10/20).
top_features = sorted_loadings.head(30).index
top_loadings = round(loadings_df.loc[top_features, 'PCA3'], 4)
# Blue bars for positive loadings, red for negative.
colors = ['blue' if val > 0 else 'red' for val in top_loadings]

fig = go.Figure(data=[go.Bar(
    x=top_loadings.index,
    y=top_loadings.abs(),
    text=top_loadings.values,  # annotate each bar with the signed loading
    textposition='inside',
    marker_color=colors,
    showlegend=False,
)])
# Dummy traces so the color coding appears in the legend.
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='blue'), showlegend=True, name='Positive'))
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='red'), showlegend=True, name='Negative'))
fig.update_layout(
    title='<b>Top 30 Most Influential Features on PCA3</b>',  # fixed: 30 bars shown, not 20
    xaxis_title='Features',
    yaxis_title='Absolute PCA3 Loadings',  # fixed: previously said PCA1
    yaxis=dict(tickformat=".2f"),
    template='plotly_white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=20, r=20, b=20, t=60),
)
fig.show()
Write about how the scenarios differ based on PCA
Code
# Correlation heatmap over the numeric columns (from column 8 on) of the
# future (non-historical) RCP 4.5 rows.
future_45 = df_orig[(df_orig['TimePeriod'] != 'Hist') & (df_orig['RCP'] == 4.5)]
corr_matrix = future_45.iloc[:, 8:].corr()

fig = px.imshow(
    corr_matrix,
    labels=dict(color="Correlation"),
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    color_continuous_scale='RdBu_r',  # red-blue colormap, reversed
)
# Square figure with tight margins so the many axis labels stay readable.
fig.update_layout(
    width=900,
    height=900,
    margin=dict(l=10, r=1, t=50, b=10),
)
fig.update_layout(
    title_text='<b>Future Correlation Heatmap : RCP 4.5</b>',
    title_x=0.5,  # center the title
)
fig.update_xaxes(tickfont=dict(size=10))
fig.update_yaxes(tickfont=dict(size=10))
fig.show()
RMSE
Another method I chose to see which values were most different between two scenarios was using the RMSE to find the columns with the greatest difference, since each scenario has the same number of data points. The data points were standardized before the RMSE calculation.
Code
# Compare scenarios sc60 vs sc58: standardize each column, then compute the
# per-column RMSE of the row-wise difference to find where the two scenarios
# diverge the most.
sc60 = df_orig[df_orig['scenario'] == 'sc60']
sc58 = df_orig[df_orig['scenario'] == 'sc58']
df1 = sc60.iloc[:, 8:-3]
df2 = sc58.iloc[:, 8:-3]

def standardize(df):
    """Return the column-wise z-scores of a numeric DataFrame."""
    return df.apply(zscore)

z_df1 = standardize(df1)
z_df2 = standardize(df2)

# Per-column RMSE; indices are reset so rows pair up positionally.
# (Removed the unused mean-absolute-difference computation.)
rmse = np.sqrt(np.mean((z_df1.reset_index(drop=True) - z_df2.reset_index(drop=True)) ** 2, axis=0))
rmse_sort = rmse.sort_values(ascending=False).head(30)

fig = go.Figure(data=[go.Bar(
    x=rmse_sort.keys(),
    y=rmse_sort.values,
    text=[round(i, 4) for i in rmse_sort.values],  # annotate bars with values
    textposition='inside',
    showlegend=False,
)])
fig.update_layout(
    title='<b>Scenario 58 vs 60 RMSE of components</b>',
    xaxis_title='Features',
    yaxis_title='RMSE (standardized values)',  # fixed: was mislabeled as PCA1 loadings
    yaxis=dict(tickformat=".2f"),
    template='plotly_white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=20, r=20, b=20, t=60),
)
fig.show()
RCP = 8.5 (Analysis)
Code
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import plotly.express as px

# End-of-century (2095-2099) RCP 8.5 rows; drop any column containing NaNs.
data = df_orig[(df_orig['RCP'] == 8.5) & (df_orig['year'].isin(range(2095, 2100)))].dropna(axis=1, how='any')
# Feature matrix: long/lat (cols 1-2), year (col 4), RCP (col 6), plus the
# numeric climate variables (cols 8 onward, excluding the last 3 columns).
X = data.iloc[:, list(range(1, 3)) + [4, 6] + list(range(8, len(data.columns) - 3))]

# Standardize so PCA is not dominated by large-magnitude variables.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA keeping 10 components so the scree plot below can show the
# explained-variance profile.  (FIX: the old comment wrongly said
# "Reduce to 2 components for visualization" while n_components=10.)
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

# k-means (2 clusters) on the PCA scores.
kmeans = KMeans(n_clusters=2, random_state=0)
clusters = kmeans.fit_predict(X_pca)

# Keep the first three component scores on the frame for the 2D/3D scatter plots.
data['PCA1'] = X_pca[:, 0]
data['PCA2'] = X_pca[:, 1]
data['PCA3'] = X_pca[:, 2]

# Component loadings (features x components) for the loading bar charts.
loadings = pca.components_.T
columns = X.columns
loadings_df = pd.DataFrame(
    loadings,
    columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5',
             'PCA6', 'PCA7', 'PCA8', 'PCA9', 'PCA10'],
    index=columns,
)

# Per-component and cumulative explained variance for the scree plot.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# x-axis labels PCA1 .. PCA10.
x_labels = [f'PCA{i+1}' for i in range(len(explained_variance_ratio))]

# Line chart of the explained-variance ratio per component.
fig = go.Figure(data=go.Scatter(
    x=x_labels,
    y=explained_variance_ratio,
    mode='lines+markers',
    text=explained_variance_ratio,
    textposition='top center',
))
fig.update_layout(
    title='<b>Explained Variance Ratio by Principal Components</b>',
    xaxis_title='Principal Components',
    yaxis_title='Explained Variance Ratio',
    # Headroom of 10% above the largest component's ratio.
    yaxis=dict(tickformat=".2%", range=[0, 1.1 * explained_variance_ratio.max()]),
    template='plotly_white',
    margin=dict(l=50, r=50, b=50, t=50),  # adjust the padding as needed
)
fig.show()
Code
# 2D scatter of the first two principal-component scores, coloured by scenario.
fig = px.scatter(
    data,
    x='PCA1',
    y='PCA2',
    color='scenario',
    title='<b>PCA and k-means Clustering</b>',
    labels={'PCA1': 'PCA1', 'PCA2': 'PCA2'},
    opacity=0.5,  # translucent markers so overlapping points remain visible
)
# Tighten the margins and centre the title near the top.
fig.update_layout(
    margin=dict(l=10, r=10, b=10, t=40),
    title_x=0.5,
    title_y=0.95,
)
fig.show()
Code
# 3D scatter of the first three principal-component scores, coloured by scenario.
fig = px.scatter_3d(
    data,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    color='scenario',
    title='<b>3D PCA with Scenario</b>',
    labels={'PCA1': 'PCA1', 'PCA2': 'PCA2', 'PCA3': 'PCA3'},
    opacity=0.5,
    size_max=0.1,
)
# Shrink markers so dense clusters stay readable.
fig.update_traces(marker=dict(size=3))
# Tight margins, centred title, and explicit axis titles on the 3D scene.
fig.update_layout(
    margin=dict(l=5, r=5, b=5, t=20),
    title_x=0.5,
    title_y=0.95,
    scene=dict(
        xaxis=dict(title='PCA1'),
        yaxis=dict(title='PCA2'),
        zaxis=dict(title='PCA3'),
    ),
)
fig.show()
Code
# Rank features by the absolute value of their PCA1 loading and plot the
# 30 largest.  (FIX: comments and title previously said "top 10"/"Top 20"
# while head(30) features are actually plotted.)
sorted_loadings = loadings_df['PCA1'].abs().sort_values(ascending=False)
top_features = sorted_loadings.head(30).index
# Signed loadings (rounded) for the selected features.
top_loadings = round(loadings_df.loc[top_features, 'PCA1'], 4)

# Blue = positive loading, red = negative; bar height is the absolute value.
colors = ['blue' if val > 0 else 'red' for val in top_loadings]

fig = go.Figure(data=[go.Bar(
    x=top_loadings.index,
    y=top_loadings.abs(),
    text=top_loadings.values,  # show the signed values inside the bars
    textposition='inside',
    marker_color=colors,
    showlegend=False,
)])
# Invisible dummy bars exist only to populate the colour legend.
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='blue'), showlegend=True, name='Positive'))
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='red'), showlegend=True, name='Negative'))
fig.update_layout(
    title='<b>Top 30 Most Influential Features on PCA1</b>',
    xaxis_title='Features',
    yaxis_title='Absolute PCA1 Loadings',
    yaxis=dict(tickformat=".2f"),
    template='plotly_white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=20, r=20, b=20, t=60),
)
fig.show()
Code
# Rank features by the absolute value of their PCA2 loading and plot the
# 30 largest.  (FIX: title previously said "Top 20" while 30 are plotted,
# and the y-axis wrongly said "Absolute PCA1 Loadings" on this PCA2 chart.)
sorted_loadings = loadings_df['PCA2'].abs().sort_values(ascending=False)
top_features = sorted_loadings.head(30).index
# Signed loadings (rounded) for the selected features.
top_loadings = round(loadings_df.loc[top_features, 'PCA2'], 4)

# Blue = positive loading, red = negative; bar height is the absolute value.
colors = ['blue' if val > 0 else 'red' for val in top_loadings]

fig = go.Figure(data=[go.Bar(
    x=top_loadings.index,
    y=top_loadings.abs(),
    text=top_loadings.values,  # show the signed values inside the bars
    textposition='inside',
    marker_color=colors,
    showlegend=False,
)])
# Invisible dummy bars exist only to populate the colour legend.
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='blue'), showlegend=True, name='Positive'))
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='red'), showlegend=True, name='Negative'))
fig.update_layout(
    title='<b>Top 30 Most Influential Features on PCA2</b>',
    xaxis_title='Features',
    yaxis_title='Absolute PCA2 Loadings',
    yaxis=dict(tickformat=".2f"),
    template='plotly_white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=20, r=20, b=20, t=60),
)
fig.show()
Code
# Rank features by the absolute value of their PCA3 loading and plot the
# 30 largest.  (FIX: title previously said "Top 20" while 30 are plotted,
# and the y-axis wrongly said "Absolute PCA1 Loadings" on this PCA3 chart.)
sorted_loadings = loadings_df['PCA3'].abs().sort_values(ascending=False)
top_features = sorted_loadings.head(30).index
# Signed loadings (rounded) for the selected features.
top_loadings = round(loadings_df.loc[top_features, 'PCA3'], 4)

# Blue = positive loading, red = negative; bar height is the absolute value.
colors = ['blue' if val > 0 else 'red' for val in top_loadings]

fig = go.Figure(data=[go.Bar(
    x=top_loadings.index,
    y=top_loadings.abs(),
    text=top_loadings.values,  # show the signed values inside the bars
    textposition='inside',
    marker_color=colors,
    showlegend=False,
)])
# Invisible dummy bars exist only to populate the colour legend.
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='blue'), showlegend=True, name='Positive'))
fig.add_trace(go.Bar(x=[None], y=[None], marker=dict(color='red'), showlegend=True, name='Negative'))
fig.update_layout(
    title='<b>Top 30 Most Influential Features on PCA3</b>',
    xaxis_title='Features',
    yaxis_title='Absolute PCA3 Loadings',
    yaxis=dict(tickformat=".2f"),
    template='plotly_white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=20, r=20, b=20, t=60),
)
fig.show()
TODO: write up how the scenarios differ based on the PCA results above.
Correlation Heatmap
Code
# Correlation heatmap for projected (non-historical) rows under RCP 8.5.
# Columns 8 onward are the numeric climate/soil variables; earlier columns
# are identifiers/metadata (Park, long, lat, veg, year, TimePeriod, RCP, scenario).
future_85 = df_orig[(df_orig['TimePeriod'] != 'Hist') & (df_orig['RCP'] == 8.5)]
corr_matrix = future_85.iloc[:, 8:].corr()

# Interactive heatmap on a reversed diverging red-blue scale
# (red = positive correlation, blue = negative).
fig = px.imshow(
    corr_matrix,
    labels=dict(color="Correlation"),
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    color_continuous_scale='RdBu_r',
)
fig.update_layout(
    width=900,   # figure width in pixels
    height=900,  # figure height in pixels
    margin=dict(l=10, r=1, t=50, b=10),  # trim surrounding whitespace
)
fig.update_layout(
    title_text='<b>Future Correlation Heatmap : RCP 8.5</b>',  # bold via HTML
    title_x=0.5,  # centre the title
)
# Smaller tick labels keep the many variable names legible.
fig.update_xaxes(tickfont=dict(size=10))
fig.update_yaxes(tickfont=dict(size=10))
fig.show()
RMSE
Code
# Compare two scenarios (sc40 vs sc37) variable-by-variable — same procedure
# as the RCP 4.5 comparison: z-score every column, pair rows positionally,
# and rank variables by the RMSE of their differences.
sc40 = df_orig[df_orig['scenario'] == 'sc40']
sc37 = df_orig[df_orig['scenario'] == 'sc37']
df1 = sc40.iloc[:, 8:-3]  # numeric variables only: drop metadata columns at both ends
df2 = sc37.iloc[:, 8:-3]

def standardize(df):
    """Return *df* with every column converted to z-scores."""
    return df.apply(zscore)

# Standardize both dataframes so columns are comparable across units/scales.
z_df1 = standardize(df1)
z_df2 = standardize(df2)

# Mean absolute difference of z-scores (computed alongside RMSE for reference).
abs_diff_z_scores = np.abs(z_df1 - z_df2)
mean_abs_diff = abs_diff_z_scores.mean()

# Per-column RMSE between the standardized scenarios; reset_index pairs rows
# by position rather than by original index labels.
rmse = np.sqrt(np.mean((z_df1.reset_index(drop=True) - z_df2.reset_index(drop=True))**2, axis=0))
rmse_sort = rmse.sort_values(ascending=False).head(30)

# Bar chart of the 30 most divergent variables.
fig = go.Figure(data=[go.Bar(
    x=rmse_sort.keys(),
    y=rmse_sort.values,
    text=[round(i, 4) for i in rmse_sort.values],  # annotate bars with their values
    textposition='inside',
    showlegend=False,
)])
fig.update_layout(
    title='<b>Scenario 40 vs 37 RMSE of components</b>',
    xaxis_title='Features',
    # FIX: y-axis previously read 'Absolute PCA1 Loadings' — a copy-paste
    # leftover from the PCA loading charts; this chart plots RMSE values.
    yaxis_title='RMSE of z-scores',
    yaxis=dict(tickformat=".2f"),
    template='plotly_white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    margin=dict(l=20, r=20, b=20, t=60),
)
fig.show()
# import plotly.express as px# import plotly.graph_objects as go# from plotly.subplots import make_subplots# # List of columns to use for coloring# color_columns = list(df_con.columns)# # Creating the initial scatter plot with the first color column# initial_color = color_columns[0]# fig = px.scatter(# df_con[df_con['year'].isin(range(2060, 2099))],# x="PPT_Annual",# y="T_Annual",# # color=initial_color,# facet_col="RCP"# )# # Updating the layout to add the title# fig.update_layout(# title={# 'text': '<b>Annual Precipitation vs Temperature by RCP Scenarios</b>',# 'x': 0.5,# 'xanchor': 'center'# },# title_font=dict(size=20)# )# # Adding dropdown filter to change color based on selected column# dropdown_buttons = [# {# 'label': column,# 'method': 'restyle',# 'args': [# {'marker': {'color': df_con[column]}}, # Update the color# {}, # Leave empty to avoid breaking restyle# # [0] # Apply to the first trace# ],# 'args2': [# {'marker': {'color': df_con[initial_color]}}, # Revert to initial color if needed# {},# # [1]# ]# }# for column in color_columns# ]# fig.update_layout(# updatemenus=[# {# 'buttons': dropdown_buttons,# 'direction': 'down',# 'showactive': True,# 'x': 1,# 'xanchor': 'center',# 'y': 1.15,# 'yanchor': 'top'# }# ]# )# # Show the figure# fig.show()
Code
# Assuming df_con is your DataFrame and is already loaded# List of columns to use for coloringtest = df_con.iloc[:,list(range(1, 3))+ [4,6] +list(range(8, len(df_orig.columns)-1))]color_columns =list(test.columns)rcp_values = test['RCP'].unique()subplot_titles = [f'RCP {rcp}'for rcp in rcp_values]# Create figure with subplots for each RCP valuefig = make_subplots(rows=1, cols=len(rcp_values), shared_yaxes=True, subplot_titles=subplot_titles, horizontal_spacing=0.15)# Add a scatter trace for each color column and each RCP valuefor i, col inenumerate(color_columns):for j, rcp inenumerate(rcp_values): fig.add_trace( go.Scatter( x=test[(test['year'].isin(range(2060, 2100))) & (test['RCP'] == rcp)]['PPT_Annual'], y=test[(test['year'].isin(range(2060, 2100))) & (test['RCP'] == rcp)]['T_Annual'], mode='markers', marker=dict( color=test[(test['year'].isin(range(2060, 2100))) & (test['RCP'] == rcp)][col], colorbar=dict(# title='Scale', tickmode='array', tickvals=[round(i,2) for i in np.linspace(start=round(min(test[(test['year'].isin(range(2060, 2100)) & (test['RCP'] == rcp))][col]),2),stop=round(max(test[(test['year'].isin(range(2060, 2100)) & (test['RCP'] == rcp))][col]),2),num=5)], ticktext=[round(i,2) for i in np.linspace(start=round(min(test[(test['year'].isin(range(2060, 2100)) & (test['RCP'] == rcp))][col]),2),stop=round(max(test[(test['year'].isin(range(2060, 2100)) & (test['RCP'] == rcp))][col]),2),num=5)], y=0.5, x=0.43+ (j*0.58) ), colorscale='rdpu' ), name=col, visible=Trueif i ==0elseFalse, hovertemplate=(f"<b>{col}</b><br>""Precipitation: %{x}<br>""Temperature: %{y}<br>""RCP: "+str(rcp) +"<br>""Value: %{marker.color}<br>""<extra></extra>" ) # This hides the secondary box with trace info # Only the first trace is visible initially ), row=1, col=j+1 )# Updating the layout to add the titlefig.update_layout( title={'text': '<b>Annual Precipitation vs Temperature by RCP Scenarios</b>','x': 0.5,'y': 0.97,'xanchor': 'center' },# title_font=dict(size=20), 
showlegend=False# Hide legend since we are using colorbars)# Adding dropdown filter to change visible tracedropdown_buttons = [ {'label': col,'method': 'update','args': [ {'visible': [col == color_column for color_column in color_columns for _ in rcp_values] }, {'title': {'text': f'<b>Annual Precipitation vs Temperature by {col}</b>', 'x':0.5},'marker': {'colorbar': {'title': 'Scale'}} } ] }for col in color_columns]fig.update_layout( updatemenus=[ {'buttons': dropdown_buttons,'direction': 'down','showactive': True,'x': 0.5,'xanchor': 'center','y': 1.16,'yanchor': 'top' } ])fig.update_xaxes(title_text="Annual Precipitation", row=1, col=1)fig.update_yaxes(title_text="Annual Temperature", row=1, col=1)fig.update_xaxes(title_text="Annual Precipitation", row=1, col=2)for annotation in fig['layout']['annotations']: annotation['font'] = {'size': 12, 'color': 'black'}# Show the figurefig.show()